library(readr)
kingcountysales <- read_csv("C:/Users/tjf4x/Desktop/R projects/King county home sales/kingcountysales_2000_2023.csv")
# Clean and transforming data
library(tidyverse)
library(lubridate)
glimpse(kingcountysales)
## Rows: 575,319
## Columns: 49
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ sale_id <chr> "1999..144", "1999..258", "1999..331", "1999..609", "…
## $ pinx <chr> "..2734100475", "..1535200725", "..6028000255", "..12…
## $ sale_date <chr> "1/5/1999", "1/5/1999", "1/4/1999", "1/11/1999", "1/7…
## $ sale_price <dbl> 150000, 235000, 293000, 178506, 270000, 184250, 17500…
## $ sale_nbr <dbl> 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, NA, 1, 1, 1, NA, 1,…
## $ sale_warning <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "34", NA,…
## $ join_status <chr> "demo", "demo", "demo", "demo", "demo", "demo", "demo…
## $ join_year <dbl> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999,…
## $ latitude <dbl> 47.54436, 47.42125, 47.57210, 47.68557, 47.76448, 47.…
## $ longitude <dbl> -122.3208, -122.4381, -122.1223, -122.1857, -122.2081…
## $ area <dbl> 78, 100, 31, 74, 38, 8, 18, 49, 70, 81, 79, 55, 69, 2…
## $ city <chr> "SEATTLE", "KING COUNTY", "KING COUNTY", "KIRKLAND", …
## $ zoning <chr> "SF 5000", "RA2.5P", "R6", "RS8.5", "R15 OP", "SF 720…
## $ subdivision <chr> "GEORGETOWN", "CHAUTAUQUA BEACH ADD", "NELSONS H E EA…
## $ present_use <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ land_val <dbl> 21000, 46000, 66000, 80000, 73000, 84000, 35000, 9100…
## $ imp_val <dbl> 66000, 132000, 207000, 69000, 119000, 75000, 122000, …
## $ year_built <dbl> 1900, 1916, 1986, 1961, 1924, 1920, 1955, 1986, 1967,…
## $ year_reno <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sqft_lot <dbl> 4000, 6695, 12360, 9450, 13600, 7200, 12554, 20300, 1…
## $ sqft <dbl> 1410, 990, 2020, 880, 2050, 790, 2160, 2230, 560, 151…
## $ sqft_1 <dbl> 760, 990, 1470, 880, 1300, 790, 1440, 1240, 560, 750,…
## $ sqft_fbsmt <dbl> 0, 0, 0, 0, 0, 0, 720, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ grade <dbl> 7, 6, 8, 6, 7, 7, 6, 9, 3, 7, 6, 5, 7, 8, 7, 7, 7, 6,…
## $ fbsmt_grade <dbl> 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,…
## $ condition <dbl> 3, 4, 3, 3, 5, 3, 3, 3, 3, 5, 3, 4, 3, 4, 5, 3, 3, 3,…
## $ stories <dbl> 1.5, 1.0, 2.0, 1.0, 1.5, 1.0, 1.0, 1.5, 1.0, 2.0, 1.0…
## $ beds <dbl> 3, 2, 3, 2, 5, 2, 4, 4, 1, 3, 3, 4, 4, 3, 3, 3, 3, 3,…
## $ bath_full <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
## $ bath_3qtr <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,…
## $ bath_half <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,…
## $ garb_sqft <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ gara_sqft <dbl> 0, 0, 480, 0, 0, 0, 0, 730, 0, 0, 0, 0, 480, 580, 0, …
## $ wfnt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ golf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ greenbelt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ noise_traffic <dbl> 2, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0,…
## $ view_rainier <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_olympics <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_cascades <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_territorial <dbl> 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_skyline <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_sound <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_lakewash <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_lakesamm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_otherwater <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ submarket <chr> "E", "H", "R", "Q", "Q", "C", "F", "G", "N", "E", "E"…
# Removing duplicates
distinct(kingcountysales)
## # A tibble: 575,319 × 49
## ...1 sale_id pinx sale_date sale_price sale_nbr sale_warning join_status
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 1999..144 ..27… 1/5/1999 150000 1 <NA> demo
## 2 2 1999..258 ..15… 1/5/1999 235000 1 <NA> demo
## 3 3 1999..331 ..60… 1/4/1999 293000 1 <NA> demo
## 4 4 1999..609 ..12… 1/11/1999 178506 1 <NA> demo
## 5 5 1999..775 ..19… 1/7/1999 270000 1 <NA> demo
## 6 6 1999..957 ..52… 1/15/1999 184250 1 <NA> demo
## 7 7 1999..1155 ..34… 1/19/1999 175000 1 <NA> demo
## 8 8 1999..1267 ..06… 1/11/1999 359850 NA <NA> demo
## 9 9 1999..1335 ..73… 1/7/1999 105000 1 <NA> demo
## 10 10 1999..1354 ..33… 1/14/1999 160000 1 <NA> demo
## # ℹ 575,309 more rows
## # ℹ 41 more variables: join_year <dbl>, latitude <dbl>, longitude <dbl>,
## # area <dbl>, city <chr>, zoning <chr>, subdivision <chr>, present_use <dbl>,
## # land_val <dbl>, imp_val <dbl>, year_built <dbl>, year_reno <dbl>,
## # sqft_lot <dbl>, sqft <dbl>, sqft_1 <dbl>, sqft_fbsmt <dbl>, grade <dbl>,
## # fbsmt_grade <dbl>, condition <dbl>, stories <dbl>, beds <dbl>,
## # bath_full <dbl>, bath_3qtr <dbl>, bath_half <dbl>, garb_sqft <dbl>, …
# selecting variables and converting date
king_time <- kingcountysales %>%
select(sale_date, sale_price, city) %>%
mutate(date = mdy(sale_date))
# converting cities to factors
unique(king_time$city)
## [1] "SEATTLE" "KING COUNTY" "KIRKLAND" "BOTHELL"
## [5] "NORMANDY PARK" "PACIFIC" "ISSAQUAH" "COVINGTON"
## [9] "KENT" "BELLEVUE" "SHORELINE" "FEDERAL WAY"
## [13] "REDMOND" "BURIEN" "MERCER ISLAND" "LAKE FOREST PARK"
## [17] "KENMORE" "AUBURN" "HUNTS POINT" "TUKWILA"
## [21] "RENTON" "MAPLE VALLEY" "SEA-TAC" "CLYDE HILL"
## [25] "MEDINA" "SNOQUALMIE" "ENUMCLAW" "NEWCASTLE"
## [29] "DES MOINES" "NORTH BEND" "BLACK DIAMOND" "WOODINVILLE"
## [33] "DUVALL" "CARNATION" "ALGONA" "YARROW POINT"
## [37] "BEAUX ARTS" "SKYKOMISH" "MILTON" "SAMMAMISH"
## [41] "SeaTac"
king_time$city <- as.factor(king_time$city)
# Checking for missing values
sum(is.na(king_time))
## [1] 0
# Calculating median by month
library(timetk)
king_monthly <-
king_time %>%
summarize_by_time(.date_var = date,
.by = "month",
median_sale = median(sale_price, na.rm = TRUE))
head(king_monthly, 12)
## # A tibble: 12 × 2
## date median_sale
## <date> <dbl>
## 1 1999-01-01 222000
## 2 1999-02-01 225475
## 3 1999-03-01 223250
## 4 1999-04-01 230000
## 5 1999-05-01 226000
## 6 1999-06-01 235000
## 7 1999-07-01 235000
## 8 1999-08-01 235920
## 9 1999-09-01 235000
## 10 1999-10-01 229970
## 11 1999-11-01 236250
## 12 1999-12-01 233800
# Plotting monthly median sale price
plot_time_series(king_monthly,
.date_var = date,
.value = median_sale,
.interactive = TRUE,
.x_lab = "Monthy Data",
.y_lab = "Median Sale price")
# Plotting monthly sales to find best months to buy or sell homes in king county
plot_seasonal_diagnostics(king_monthly, .date_var = date, .value = median_sale)
# Best month to sell by median home price is June
# Best month to buy is January
# Removing data near housing crisis to see if it changes monthly medians
king_monthly_post2012 <-
king_monthly %>%
filter_by_time(.date_var = date,
.start_date = "2013",
.end_date = "2024")
glimpse(king_monthly_post2012, 12)
## Rows: 132
## Columns: 2
## $ date <date> …
## $ median_sale <dbl> …
plot_seasonal_diagnostics(king_monthly_post2012, .date_var = date, .value = median_sale)
# This has similar results with June still being the best month to sell
# but December edging out January as the best month to buy.
# Building a simple univariant forecast model
# -------------------------------------------
# Creating testing and training split
library(rsample)
king_monthly_split <- initial_time_split(king_monthly_post2012, prop = 130/136)
king_training <- training(king_monthly_split)
king_testing <- testing(king_monthly_split)
# converting data to a tibble and setting index as date
library(fable)
library(tsibble)
king_training <-
king_training %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index = date)
king_testing <-
king_testing %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index = date)
king_monthly_post2012 <-
king_monthly_post2012 %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index = date)
# Train the models
library(feasts)
king_fit <-
king_training %>%
model(stepwise = ARIMA(median_sale),
search = ARIMA(median_sale, stepwise=FALSE))
# Viewing model
tidy(king_fit)
## # A tibble: 6 × 6
## .model term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 stepwise ar1 0.857 0.0488 17.5 3.70e-34
## 2 stepwise sma1 -0.789 0.0945 -8.35 1.82e-13
## 3 stepwise constant 7027. 584. 12.0 4.84e-22
## 4 search ar1 0.857 0.0488 17.5 3.70e-34
## 5 search sma1 -0.789 0.0945 -8.35 1.82e-13
## 6 search constant 7027. 584. 12.0 4.84e-22
king_fit %>%
accuracy() %>%
arrange(MAPE)
## # A tibble: 2 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 stepwise Training -426. 18360. 13356. -0.188 1.98 0.251 0.283 0.0519
## 2 search Training -426. 18360. 13356. -0.188 1.98 0.251 0.283 0.0519
# Checking forecast of training data vs full data set
king_fit %>%
forecast(h = "6 months") %>%
autoplot(king_monthly_post2012)
# final fit on full data set
king_final_fit <-
king_monthly_post2012 %>%
model(stepwise = ARIMA(median_sale),
search = ARIMA(median_sale, stepwise=FALSE))
# Fitting final model on full data set
king_final_fit %>%
forecast(h = "12 months") %>%
autoplot(king_monthly_post2012) +
labs(x = "Month", y = "median home price")
king_final_fit %>%
forecast(h = "12 months") %>%
autoplot() +
labs(x = "Month", y = "median home price")
king_monthly_t <-
king_monthly %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index = date)
# The data ends in December of 2023, but referencing median sales in recent
# we can see the forecast starts to get too optimistic. I added back in the housing
# crisis data and retrained.
king_final_fit_full <-
king_monthly_t %>%
model(stepwise = ARIMA(median_sale),
search = ARIMA(median_sale, stepwise=FALSE))
king_final_fit_full %>%
forecast(h = "12 months") %>%
autoplot(king_monthly_t) +
labs(x = "Month", y = "median home price")
king_final_fit_full %>%
forecast(h = "12 months") %>%
autoplot() +
labs(x = "Month", y = "median home price")
# Adding the full time series makes the model more conservative and makes the
# forecast more accurate.